// Shader-style type names (vec2/vec4) mapped onto the OpenCL float vector types.
#define vec2 float2
#define vec4 float4
// Colour-style swizzle aliases: the tokens `rgb` / `rgba` expand to `xyz` / `xyzw`.
#define rgb xyz
#define rgba xyzw

// Pi as a single-precision literal.
#define PI 3.1415926535897932f

// play with these parameters to customize the effect
// ===================================================

//speed


// ===================================================
// Floored modulo applied to each component of x: x - y*floor(x/y).
static vec2 myMod(vec2 x, float y)
{
	vec2 whole_periods = floor(x / y);
	return x - y * whole_periods;
}

// Folds uv back into the unit square with mirrored repetition.
// Per component: cells whose integer part is even keep the fractional part,
// cells whose integer part is odd use (2 - fractional part) wrapped by 1.
static vec2 mirror(vec2 uv)
{
	// 0 where floor(uv) is even, 1 where it is odd.
	vec2 odd_cell = myMod(floor(uv), 2.0f);
	// +1 for even cells, -1 for odd cells.
	vec2 direction = -2.0f * odd_cell + 1.0f;
	vec2 folded = 2.0f * odd_cell + direction * myMod(uv, 1.0f);
	return myMod(folded, 1.0f);
}

   // Sums `steps` cosine waves evaluated at `coord` and returns the cosine of
   // that sum. Wave k is rotated by k * angle_step; its sample position is
   // shifted by (cos, -sin) of that angle times time_speed, plus the
   // time_speed_x / time_speed_y offsets. With steps <= 0 the loop is skipped
   // and the result is cos(0).
   float col(vec2 coord,int steps, float time_speed_x, float time_speed_y,float time_speed)
   {
     const float angle_step = 0.8975979010256551f;  // ~ 2*pi/7
     const float wave_frequency = 6.0f;
     float accum = 0.0f;
     int k = 0;

     while (k < steps)
     {
       float angle = angle_step * (float)(k);
       float ca = cos(angle);
       float sa = sin(angle);

       // Per-wave copy of the coordinate, shifted over time.
       vec2 shifted = coord;
       shifted.x += ca * time_speed + time_speed_x;
       shifted.y -= sa * time_speed - time_speed_y;

       accum = accum + cos((shifted.x * ca - shifted.y * sa) * wave_frequency) * 2.4f;
       k++;
     }

     return cos(accum);
   }

//---------- main


// Sampler used by INPUT(): normalized coordinates, clamp-to-edge addressing,
// linear filtering.
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP_TO_EDGE |  CLK_FILTER_LINEAR;

// Samples src_data at tc after remapping tc through param->origROI:
// elements [2],[3] scale the coordinate and elements [0],[1] offset it.
// The texel is returned with its first and third components swapped (.zyxw).
static vec4 INPUT(image2d_t src_data, vec2 tc, __global FilterParam* param)
{
	vec2 roi_scale = (vec2)(param->origROI[2], param->origROI[3]);
	vec2 roi_offset = (vec2)(param->origROI[0], param->origROI[1]);
	vec2 sample_pos = tc * roi_scale + roi_offset;

	vec4 texel = read_imagef(src_data, sampler, sample_pos);
	return texel.zyxw;
}

/*
 * MAIN - animated ripple/refraction filter, one work-item per output pixel.
 *
 * For each pixel the kernel evaluates the wave field col() at the pixel and at
 * two offset positions, turns the differences into a displacement (dx, dy),
 * samples the source image at the displaced (mirrored) coordinate, brightens
 * it with a highlight factor, and blends the result with the undisturbed
 * source pixel.
 *
 * src_data   source image, sampled through INPUT().
 * dest_data  destination image, written at (get_global_id(0), get_global_id(1)).
 * param      filter parameters; cur_time drives the animation and origROI is
 *            used by INPUT() to remap texture coordinates.
 * emboss     scales the wave-field differences that become dx and dy.
 * steps      number of wave terms summed by col().
 * strength   blend amount in percent: factor = strength / 100, where 0 gives
 *            the original pixel and 100 the full effect. The value is not
 *            clamped.
 */
__kernel void MAIN(
      __read_only image2d_t src_data,
      __write_only image2d_t dest_data,        //Data in global memory
	  __global FilterParam* param,
	  float emboss,
	  int steps, 
	  int strength)  		// the gpu items/threads should be newW*newH
{
	// Fixed effect tunables.
	const float speed = 0.2f;                   // time scale for the per-wave shift
	const float speed_x = 0.3f;                 // time scale for the x offset
	const float speed_y = 0.3f;                 // time scale for the y offset
	const float reflectionCutOff = 0.012f;      // dx/dy must both exceed this for the highlight boost
	const float reflectionIntence = 200000.0f;  // gain applied to the highlight exponent
	const float delta = 60.0f;                  // divisor for the sample offsets and the differences
	const float intence = 700.0f;               // gain on dx*dy in the base highlight factor

	float global_time = param->cur_time;
	float time = global_time * 1.3f;
	float time_speed = time * speed;
	float time_speed_x = time * speed_x;
	float time_speed_y = time * speed_y;

	// Pixel-centre coordinate divided by the NDRange size.
	float2 iResolution = (float2)(get_global_size(0),get_global_size(1));
	vec2 fragCoord = (vec2)(get_global_id0( param), get_global_id1( param));
	vec2 tc = (vec2)(fragCoord.x + 0.5f, fragCoord.y + 0.5f)/iResolution.xy;

	vec2 c1 = tc;
	vec2 c2 = tc;
	float cc1 = col(c1,steps, time_speed_x,time_speed_y,time_speed);

	// NOTE(review): tc is a pixel index divided by the resolution, yet the offsets
	// below are iResolution/delta (pixel-sized values). Kept exactly as in the
	// original; confirm this is intended.
	c2.x += iResolution.x/delta;
	float dx = emboss*(cc1-col(c2,steps, time_speed_x,time_speed_y,time_speed))/delta;

	c2.x = tc.x;
	c2.y += iResolution.y/delta;
	float dy = emboss*(cc1-col(c2,steps, time_speed_x,time_speed_y,time_speed))/delta;

	// Displace the lookup coordinate.
	c1.x += dx*2.0f;
	c1.y = c1.y+dy*2.0f;

	// dx and dy are scalars, so the product is written directly rather than via dot().
	float alpha = 1.0f+dx*dy*intence;

	// Boost the highlight only where both differences exceed the cut-off.
	float ddx = dx - reflectionCutOff;
	float ddy = dy - reflectionCutOff;
	if (ddx > 0.0f && ddy > 0.0f)
	{
		alpha = pow(alpha, ddx*ddy*reflectionIntence);
	}

	vec4 tuneColor = INPUT(src_data, mirror(c1), param)*(alpha);
	vec4 orig = INPUT(src_data,tc, param);

	// Linear blend between the original and the rippled colour.
	float factor = strength/100.0f;
	float4 outputColor = tuneColor * factor + orig * (1.0f - factor);

	// INPUT() returns .zyxw-swizzled texels; .zyx here restores the stored channel
	// order. Alpha is taken from the undisturbed source pixel.
	// NOTE(review): the write position uses get_global_id() directly while fragCoord
	// uses get_global_id0/1(param) - confirm the two agree.
	write_imagef(dest_data, (int2)(get_global_id(0), get_global_id(1)), (vec4)(outputColor.zyx, orig.w));
}

